---
title: "Calculating Legibility from Data on Ages"
author: "Volha Charnysh "
input: "Ages1897_TypedVC.xlsx"
output: Figure 4 (two parts) and age heaping index for all districts in European Russia
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```

# File Formatting

```{r}
library(readxl)

# read file
lst <- lapply(1:50, function(i) read_excel("Ages1897_TypedVC.xlsx", sheet = i))

# clean
library(plyr)
library(dplyr)


lst_formatted <- list()
for (i in 1:50) {
  df <- t(lst[[i]]) # transpose
  df[is.na(df)] <- 0 # replace NA with 0
  colnames(df) <- df[1,] # input column names
  df <- as.data.frame(df) # convert to dataframe
  df <- df[-1, ] # remove first row (column names)
  df <- df[, -c(1, 3)] # remove character columns
  
  # save observations as numeric (used to be factor levels)
  df <- data.frame(lapply(df, as.character), stringsAsFactors=FALSE)
  df <- data.frame(lapply(df, as.numeric), stringsAsFactors=FALSE)
  
  # sum by each ter_ID
  df <- ddply(df, "ter_ID", numcolwise(sum))
  
  # remove columns containing "SUM"
  df <- df[!grepl("^SUM", colnames(df))]
  
  # save in list
  lst_formatted[[i]] <- df
}

# check if the number of columns in each original sheet
# matches with the number of rows in each reformatted list item 
#sometimes if a zero or random character occurred in columns w/out data it would be an issue here.
for (i in 1:50) {
  print((ncol(lst[[i]])-1)/2)
  print(nrow(lst_formatted[[i]]))
}

# check that all sheets have 113 columns
for (i in 1:50) {
  print(i)
  print(ncol(lst_formatted[[i]]))
}

# calculate total number of uezds
total_uezds <- c()
for (i in 1:50) {
  total_uezds[i] <- (ncol(lst[[i]])-1)/2
}
sum(total_uezds) #N=501

uezds_all <- do.call("rbind", lst_formatted)

###############################################
### Frequency of different ages by district ###
###############################################

Ages<-colSums(uezds_all[,4:ncol(uezds_all)-1]) #Here watch out for columns that are below 1 or contain ter_id

#pdf(file="ageCurve50gub.pdf")
plot(Ages, pch=20, ylab="Frequency", xlab="Age")
grid (NULL,NULL, lty = 6, col = "cornsilk2") 
#dev.off()
```


# Calculating Whipple's and Myers' Indices

```{r}
# function for calculating Whipple's and Myers' Indices
# for each uezd
heaping.fun <- function(dat, age.low, age.high, top.age, start.age){
  ## dat: a uezd's age info
  ## age.low: 10th youngest age from the start, 
  ##          e.g., if start-age is 0, age.low would be 9
  ## age.high: 10th oldest age from the end, 
  ##          e.g., if end-age is 109, age.high would be 100
  ## top.age: oldest age in the dataset (e.g. 109 or 75)
  ## start.age: youngest age in the dataset (e.g. 0 or 15)
  
  # create matrix to save current uezd's info
  uezd <- matrix(data = NA, nrow = top.age-start.age+1, ncol = 2)
  colnames(uezd) <- c("age", "pop")
  
  # save uezd info into a column matrix
  each_row <- t(dat)
  
  # save column matrix into uezd matrix
  uezd[, 1] <- c(start.age:top.age)
  uezd[, 2] <- c(each_row)
  # convert to dataframe
  uezd <- as.data.frame(uezd)
  # convert columns to numeric
  uezd[, 1] <- as.numeric(as.character(uezd[, 1]))
  uezd[, 2] <- as.numeric(as.character(uezd[, 2]))
  # extract terminal digit
  uezd <- mutate(uezd, terminal = age %% 10)
  
  # Whipple's
  ## get total population
  total <- sum(uezd$pop)
  ## summarize by terminal digit
  whipple <- uezd %>% 
    dplyr::group_by(terminal) %>% 
    dplyr::summarise(freq = sum(pop), pct = 100*freq/total)
  ## calculate Whipple's index
  Whipple_index <- (whipple[1, 2] + whipple[6, 2]) / (1/5 * total) * 100
  Whipple_index <- as.numeric(Whipple_index)
  
  # Myers'
  ## for instance, 0 to 99, 1-100, 2-101, ..., 9-108. 
  ## calculate weights: populations for ages 0 to 8, respectively, are included 1 to 9 times
  ##                    populations for ages 100 to 108, respectively, are included 9 to 1 times
  ##                    populations for all other ages are each counted 10 times
  Myers_uezd <- mutate(uezd, weight = ifelse(age < age.low, age-start.age+1, 
                                             ifelse(age >= age.high, 
                                                    top.age-age, 10)))
  
  ## get total population, 
  ## using population for each age multiplied by how many times it is counted
  total.m <- sum(Myers_uezd$pop * Myers_uezd$weight)
  
  ## generate frequency by terminal digits, same idea as Lee&Zhang Appendix table A1.4
  Myers_uezd.b <- Myers_uezd %>% 
    dplyr::group_by(terminal) %>%
    dplyr::summarize(freq = sum(pop*weight), # same idea as column 5 of Lee&Zhang Appendix table A1.4
                     pct = 100*freq/total.m) # same idea as column 6 of Lee&Zhang Appendix table A1.4
  ## (obviously, the sum of the freq column yields the same total as total.m)
  ## (can be tested using the following code: sum(Myers_uezd.b$freq).)
  
  ## calculate deviation from 10 and divide by 2
  Myers_index <- sum(abs(Myers_uezd.b$pct - 10))/2
  
  return(list(Whipple = Whipple_index, Myers = Myers_index))
}
```

## Calculating Myers and Whipple's for age bin [0-109] and [15-74]

```{r}
# save reformatted data in a new matrix
# 0-109
age_calc.0_109 <- uezds_all
# under 1 = age 0 (X0)
colnames(age_calc.0_109)[2] <- "X0"
# cut age = 110 to make number of ages a multiple of 10: 0 - 109
age_calc.0_109 <- age_calc.0_109[, c(1:(ncol(age_calc.0_109)-2))]

# 15-74
age_calc.15_74 <- uezds_all
# cut ages 0 to 14 and ages 75 to 110
age_calc.15_74 <- age_calc.15_74[, c(1, 17:(ncol(age_calc.15_74)-37) )]

# for saving results
whipples.0_109 <- c()
myers.0_109 <- c()
whipples.15_74 <- c()
myers.15_74 <- c()

# run for loops
for (i in 1:nrow(age_calc.0_109)) {
  # run function for each row (i), uezd, in age_calc.0_109, save Whipple index
  whipples.0_109[i] <- heaping.fun(dat = age_calc.0_109[i, c(2:ncol(age_calc.0_109))],
                                   age.low = 9, age.high = 100, 
                                   top.age = 109, start.age = 0)$Whipple
  # run function for each row (i), uezd, in age_calc.0_109, save Myers index
  myers.0_109[i] <- heaping.fun(dat = age_calc.0_109[i, c(2:ncol(age_calc.0_109))],
                                age.low = 9, age.high = 100, 
                                top.age = 109, start.age = 0)$Myers
  
  # run function for each row (i), uezd, in age_calc.15_74, save Whipple index
  whipples.15_74[i] <- heaping.fun(dat = age_calc.15_74[i, c(2:ncol(age_calc.15_74))],
                                   age.low = 24, age.high = 65, 
                                   top.age = 74, start.age = 15)$Whipple
  # run function for each row (i), uezd, in age_calc.15_74, save Myers index
  myers.15_74[i] <- heaping.fun(dat = age_calc.15_74[i, c(2:ncol(age_calc.15_74))],
                                age.low = 24, age.high = 65, 
                                top.age = 74, start.age = 15)$Myers
}

# save into results matrix, results
results <- matrix(data = NA, nrow = nrow(age_calc.0_109), ncol = 5)
colnames(results) <- c("ter_ID", "Whipple.0_109", "Whipple.15_74", "Myers.0_109", "Myers.15_74")
results[, 1] <- age_calc.0_109$ter_ID
results[, 2] <- whipples.0_109
results[, 3] <- whipples.15_74
results[, 4] <- myers.0_109
results[, 5] <- myers.15_74

# correlation between myers 0-109 and myers 15-74
cor.test(results[, 4], results[, 5]) #Cor = 0.984
```





```{r}
uezds_all$Sum<-rowSums(uezds_all[,2:ncol(uezds_all)-1])
MyersDataset<-merge(results, uezds_all[,c(1, ncol(uezds_all)-1, ncol(uezds_all))], by="ter_ID")

#write.csv(MyersDataset, "AgeHeaping50Gub.csv") 
#Note: subset of this data for famine provinces is already merged with the rest of the data.

##############################################
#### Right panel of Figure 3 in the paper: ###
##############################################

#pdf(file="figures/HistogramMyersRaw50GAllages.pdf")
hist(results[,4],xlab="Myers Index (Raw)", main="")
#dev.off()

plot(density(na.omit(results[,4])), xlab="Myers Index (Raw)")
```



























